#Some imports
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
Load the dataset and save it in a data frame.
# Load the breast-cancer dataset as DataFrames and attach the diagnosis label.
from sklearn.datasets import load_breast_cancer

# Names of the 30 numeric predictors; reused later to rebuild DataFrames
# after the scaler returns plain numpy arrays.
headers = [
    'mean radius', 'mean texture', 'mean perimeter', 'mean area',
    'mean smoothness', 'mean compactness', 'mean concavity',
    'mean concave points', 'mean symmetry', 'mean fractal dimension',
    'radius error', 'texture error', 'perimeter error', 'area error',
    'smoothness error', 'compactness error', 'concavity error',
    'concave points error', 'symmetry error', 'fractal dimension error',
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness', 'worst compactness', 'worst concavity',
    'worst concave points', 'worst symmetry', 'worst fractal dimension',
]
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X.head()
# Predictors plus the diagnosis column (0 = malignant, 1 = benign).
data_set = X.assign(label=y)
data_set.head()
Breast cancer is one of the most common cancers among women worldwide, representing the majority of new cancer cases and cancer-related deaths according to global statistics.
The Breast Cancer is publicly available and was created by Dr. William H. Wolberg, physician at the University Of Wisconsin Hospital at Madison, Wisconsin, USA. To create the dataset Dr. Wolberg used fluid samples, taken from patients with solid breast masses and an easy-to-use graphical computer program called Xcyt, which is capable of perform the analysis of cytological features based on a digital scan. The program uses a curve-fitting algorithm, to compute ten features from each one of the cells in the sample, than it calculates the mean value, extreme value and standard error of each feature for the image, returning a 30 real-valuated vector.
Ten real-valued features are computed for each cell nucleus:
radius (mean of distances from center to points on the perimeter)
texture (standard deviation of gray-scale values)
perimeter
area
smoothness (local variation in radius lengths)
compactness (perimeter² / area − 1.0)
concavity (severity of concave portions of the contour)
concave points (number of concave portions of the contour)
symmetry
fractal dimension ("coastline approximation" − 1)
The column 'label' represents the diagnosis and is 0 if the tumor is classified as malignant, 1 if it is classified as benign.
The aim of this analysis is to perform a binary classification task and point out which features are most relevant in predicting whether the mass is malignant or benign.
First of all we have to check if there are some null values:
X.isnull().sum()
As we can see, there are no null values.
Now I decide to print out some statistics of the data set:
X.describe()
The data seem to be well distributed among the values. This information suggests that, in order to obtain good performance, standardization is necessary.
In the following plot we can see that the labels are a bit unbalanced in favor of the class 'benign'. We have 357 records for the '1' class and 212 records for the label '0'. We have to keep this information in mind during the classification task in order to adopt a good strategy such as oversampling.
# Class balance of the labels: 357 benign (1) vs 212 malignant (0).
# Bug fix: the original call `sns.countplot(y, data=X, ...)` passed the label
# Series positionally; in seaborn >= 0.12 the first positional parameter is
# `data`, so it collided with `data=X` and raised a TypeError.  Passing the
# labels explicitly as `x` works on every seaborn version.
sns.set_theme(style="darkgrid")
sns.countplot(x=y, palette="Set2")
plt.title('Malignant vs Benign', fontsize=14)
plt.show()
For the further steps I decided to plot some interesting results separately for each kind of value (mean, worst, standard error), starting from the 'mean' values.
A good way to visualize the relationship between pairs of features is the pairplot() function provided by seaborn.
# PAIRPLOTS FOR THE MEAN FEATURES
list_mean = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
             'mean smoothness', 'mean compactness', 'mean concavity',
             'mean concave points', 'mean symmetry', 'mean fractal dimension',
             'label']
mean_data = data_set[list_mean]
mean_data.head()
# Pairwise scatter plots coloured by diagnosis; histograms on the diagonal.
g = sns.pairplot(mean_data, hue='label', markers=["o", "D"],
                 palette="Set2", diag_kind="hist")
# NOTE(review): this re-draw is likely redundant — pairplot already plots the
# off-diagonal scatters; confirm whether it can be dropped.
g.map_offdiag(plt.scatter)
g.add_legend()
# Bug fix: the displayed title read 'PairPolt'.
g.fig.suptitle('PairPlot for the mean', fontsize=20)
From the pairplots we can note that for almost all the features the malignant tumors seem to have a longer range, except for fractal dimension. The diagonal represents the distribution of a single feature for each label. We can see that the peak for malignant is reached later. This means that malignant tumors tend to have a larger radius, rougher texture, and more compactness, concavity and concave points. We can also recognize that some features have high correlations, for example perimeter with radius, area with radius and area with perimeter. For more specific information we need to compute the correlation matrix.
# PAIRPLOTS FOR THE STANDARD-ERROR FEATURES
list_se = ['radius error', 'texture error', 'perimeter error', 'area error',
           'smoothness error', 'compactness error', 'concavity error',
           'concave points error', 'symmetry error',
           'fractal dimension error', 'label']
se_data = data_set[list_se]
# Pairwise scatter plots coloured by diagnosis; histograms on the diagonal.
g = sns.pairplot(se_data, hue='label', markers=["o", "D"],
                 palette="Set2", diag_kind="hist")
# NOTE(review): this re-draw is likely redundant — pairplot already plots the
# off-diagonal scatters; confirm whether it can be dropped.
g.map_offdiag(plt.scatter)
g.add_legend()
# Bug fix: the displayed title read 'PairPolt'.
g.fig.suptitle('PairPlot for the se', fontsize=20)
Even if less obviously also in this case we can observe that the range of the malignant tends to higher values , and also the peak is reached later in the distribution plots.
# PAIRPLOTS FOR THE 'WORST' FEATURES
worst = ['worst radius', 'worst texture', 'worst perimeter', 'worst area',
         'worst smoothness', 'worst compactness', 'worst concavity',
         'worst concave points', 'worst symmetry',
         'worst fractal dimension', 'label']
worst_data = data_set[worst]
# Pairwise scatter plots coloured by diagnosis; histograms on the diagonal.
g = sns.pairplot(worst_data, hue='label', markers=["o", "D"],
                 palette="Set2", diag_kind="hist")
# NOTE(review): this re-draw is likely redundant — pairplot already plots the
# off-diagonal scatters; confirm whether it can be dropped.
g.map_offdiag(plt.scatter)
g.add_legend()
# Bug fix: the displayed title read 'PairPolt'.
g.fig.suptitle('PairPlot for Worst ', fontsize=20)
For 'worst' we can do the same observation like for 'mean'.
As we could expect the behaivours for the three kind of data are quite the same .
A factor that could affect good performance of the analysis is the correlation between attributes, because most of the classification algorithms shown in this work assume that all the predictors are independent. For this purpose Pearson's coefficient is calculated as follows for each pair of random variables (X,Y): $$\rho_{x,y}=\frac{Cov(X,Y)}{\sigma_x \sigma_y }$$
Correlation is a normalization of the covariance by $\sigma_x$,$\sigma_y$, the respectivily standard deviation of X,Y.This number is in range [-1,1] and it measures the strenght of the correlations between X and Y. The higher the module of the value the strongher the correlation. It's important to note that $\rho$ represent only linear correlations and so a value of $\rho$ near by zero doesn't indicate necessarly absence of correlation. Can also means strong non linear correlation.
In order to visualize the correlation between features I plot the following heatmap. The correlation is set to be in range [-1,1], and is max for dark blue.
We can observe that there are many features with really high correlations, so a feature selection procedure for dimensionality reduction is needed during the preprocessing step.
# Heatmap of the pairwise Pearson correlations between the 30 predictors.
cor_matrix = X.corr()
# The matrix is symmetric, so mask the redundant upper triangle.
mask = np.triu(np.ones_like(cor_matrix, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(25, 20))
    ax = sns.heatmap(
        cor_matrix,
        mask=mask,
        square=True,
        cmap="YlGnBu",
        vmin=-1,
        vmax=1,
        annot=True,
    )
In order to create a learning algorith able to classify instances, we need to create a training set S, sampled from the entire dataset D, and to train our model triyng to minimize the training error. This approach is based on the Empirical Risk Minimization paradigm, that asserts that if S is representative of D, minimizing the error on S we are also minimizing the error on D. So I split the dataset in two portion with the percentage 70% for the training set and 30%for the evaluation set. In order to have both the two sets with approximately the same percentage of bening and milignant samples I adopt a statified strategy.
It has to be considered that the split is randomly performed but for an effective comparison across the different classifier I decide to always keep the same one.
from sklearn.model_selection import train_test_split

# 70/30 stratified split (both sets keep the benign/malignant proportions).
# Bug fix: the text above promises that the *same* split is reused for every
# classifier, but without a fixed seed each run produced a different split —
# pin random_state so the comparison across models is reproducible.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)
The data exploration phase shows that features has widespread range of values , with different means and standard deviations. That can affetc the correctess of the classification task beacuse features with grater variance can influence more the objective function. In addition, many cassifications algorithm assume that data are zero centerend and with unit variance. So i conclude that is necessary to adopt a standardization tecnique.
So indipendently for each feature I subtract the mean and divide for the standart devitaion of the current feature.
The mean and the standard deviation are compunted only taking into account the training-set, and the transformation is then applied to the test-set because usally in the initial phase the test set is unknown. It's formally uncorrected taking into accound during training procedure any kind of information relative with the test set.
In the plot below we can visualize the standardized outputs.
from sklearn.preprocessing import StandardScaler

# Standardize each feature: fit the scaler on the training set only, then
# apply the same transform to the test set so no test-set statistics leak
# into training.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
x_test = ss.transform(x_test)
# fit_transform/transform return numpy arrays; restore the DataFrames.
X_train = pd.DataFrame(X_train, columns=headers)
x_test = pd.DataFrame(x_test, columns=headers)
# Bug fix: displot is a figure-level function that always creates its own
# figure, so the original plt.figure(figsize=(25, 20)) only left a stray
# blank figure behind; the plot size is controlled via `height` instead.
sns.displot(data=X_train, kind="kde", fill=True, height=15)
Feature selection is the process of reducing the number of input variables when developing a predictive model.
It is desirable to reduce the number of input variables to both reduce the computational cost of modeling and, in some cases, to improve the performance of the model.
Statistical-based feature selection methods involve evaluating the relationship between each input variable and the target variable using statistics and selecting those input variables that have the strongest relationship with the target variable. These methods can be fast and effective, although the choice of statistical measures depends on the data type of both the input and output variables. There are 2 different type of feature selections algorithms:
unsupervised features selection:Do not use the target variable (e.g. remove redundant variables thanks to Pearson's correlations coefincent);
supervised features selection: use the target variable and choose the best k in the specific classification tack)
I decide to try both the methond and make comparison between the resuting model.
As we had observed in the correlation heatmap, in the dataset there are many features that are strongly linear correlated. It is not meaningfull to keep all of them into account. I remove featurs with person's coeficients >=0.95 with other predictors.
# Unsupervised feature selection: drop every column j for which some earlier
# column i (i < j) has a Pearson correlation of at least 0.95 with it.
corr = X_train.corr()
# Boolean mask of the strict upper triangle (pairs considered exactly once).
upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
# A column survives unless any earlier column is correlated >= 0.95 with it
# (NaNs from the masked lower triangle compare as False).
columns = ~(corr.where(upper) >= 0.95).any(axis=0).to_numpy()
selected_columns = X_train.columns[columns]
unsup_df = X_train[selected_columns]
print(selected_columns)
unsup_test = x_test[selected_columns]
print(len(selected_columns))
The selected features are 'radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'perimeter_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'. The resulting DataFrame object has only 23 features.
In supervised features selection, there is a different approach: we need to set a priori the hyperparameter K( number of best attribute desired) and than we can evaluate how good the model performs only when the classification models gives its results. And also the hyperparametes of the classification models need to be tuned. So, at each step, we need to define both K and model hyperparameters.
I choose the the RFE module provided by sklearn. Given an external estimator that assigns weights to features (e.g., the coefficients of a linear model), the goal of recursive feature elimination (RFE) is to select features by recursively considering smaller and smaller sets of features. First, the estimator is trained on the initial set of features and the importance of each feature is obtained. In this case I use the precision score of the classification in order to choose the best features set.(better explained later). Then, the least important features are pruned from current set of features. That procedure is recursively repeated on the pruned set until the desired number of features to select is eventually reached.
This bias (caused by umbalance in the label count ) in the training dataset can influence many machine learning algorithms, leading some to ignore the minority class entirely. This is a problem as it is typically the minority class on which predictions are most important.
One approach to addressing the problem of class imbalance is to randomly resample the training dataset. The two main approaches to randomly resampling an imbalanced dataset are to delete examples from the majority class, called undersampling, and to duplicate examples from the minority class, called oversampling.
Usally oversampling is a more stable choiche beacuse guarant to don't lose any information in the training set. In this specific case I adopt oversampling according to the size of my data set.
I used the SMOTE module provided by imblearn, SMOTE randomly selects a minority class istance point and finds its k nearest minority class neighbors, then it creates new artificial samples along the lines between the selected point and one of the neighbors.
Of course the oversempling tecnique is adopted only on the training set.
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the *training* set only (the test set is
# left untouched).
X_SMOTE, y_SMOTE = SMOTE().fit_resample(X_train, y_train)
print("Oversampling on X generate a dataset of", X_SMOTE.shape,
      "istances, while the original dataset had ", len(X_train), " instances")
# Bug fix: the original message had the class names swapped — per the label
# encoding stated earlier (and load_breast_cancer's docs), 0 = malignant and
# 1 = benign.
print('The new dataset has', np.count_nonzero(y_SMOTE == 0),
      'elements for malignant, and', np.count_nonzero(y_SMOTE == 1),
      'elements for benign.')
smote_df = pd.DataFrame(X_SMOTE, columns=headers)
smote_df.describe()
# Oversampled variant of the unsupervised-selection training set.
X_SMOTE_unsup, y_SMOTE_unsup = SMOTE().fit_resample(unsup_df, y_train)
X_SMOTE.head()
In a general classification problem, the goal is to learn a classifier that performs well on unseen data drawn from the same distribution as the available data; in other words, to learn classifiers with good generalization. One common way to estimate generalization capabilities is to measure the performance of the learned classifier on test data that has not been used to train the classifier. In order to compare models and find the one that performs better, it's needed to define which metrics will be used to evaluate performances.
Let: $TP$ = True positives, $TN$ = True negative, $FP$ = False positive, $FN$ = False negative. For this task I choose the following metrics:
Accuracy = $\frac{TP + TN}{TP+TN + FP + FN} = \frac{\text{Number of correct predictions}}{\text{Total number of predictions}}$
Precision = $\frac{\text{TP}}{\text{TP + FP}}$
Recall = $\frac{\text{TP}}{\text{TP + FN}}$
F1 Score = $2 \cdot \frac{\text{Precision}\cdot \text{Sensitivity}}{\text{Precision + Sensitivity}} $
Note that since we want to detect cancer everytime there is one, we might focus on having a good precision rather than a good accuracy or recall beacuse the worst case in real life is a case of false positive, that is a case in which the diagnosis is for bening mass but it's not.
For all the following alghorithm is adopted a stratified k-fold corss validation stategy for tuning the hyperparameters that consist in splitting the training set in k partition , train the model on k-1 partion and finally evaluate it on the remaining one. In order to choose the best configuration I used the precision score.
After tuning I test the model on the test set and plot the discussed metrics.
The decsion tree is the one of the intuitive and interpretable supervised learning. It's a model that predicts the class of the target variable by learning simple decsion rule inferred form data.
The predictor space, the set of possible values X1,X2,...,Xj, is divided in j non-overlapping regionsR1,R2,...Rj(these regions could have any shape but high dimensional rectangles are chosen for simplicity of the interpretation of the resulting predictive model). Since it is computationally very expensive to divide in every possible partition the space, a top down, greedy approach is used: the algorithm starts form a first node in the tree and then at each step the predictor space is split and this split is performed by choosing the best one in that level of the tree rather than looking ahead to find the split that will lead to a better final result. In order to choose the best split different types of impurity measure of the nodes are defined. For this work i used the Gini index. Gini index computes the impurity of the nodes as 1- the squared frequencies of class k appereing at node t.It's a sort of measure of the probability of a particular variable being wrongly classified when it is randomly choosen.
A decsion tree is computed form the training data and then tested on the test set. I tuned different hyperparamether with the stratified k-fold cross validation using k=5 . This tuning phase also include the tuning of the number of best features selceted with RFE.
In order to make comparison between the two different kind of feature selection I also fine tuned the model with the training-set obtained with unsupervised features selections. For the tuning phase the crucial metrics is Precision.
With supervised Fetaure selection the best configuration is reached with :
'mean concavity', 'mean concave points', 'mean symmetry',
'concave points error', 'symmetry error', 'fractal dimension error',
'worst radius', 'worst texture', 'worst perimeter', 'worst area',
'worst smoothness'
The confusion matrix calculated on the test set is: ;
With unsupervised features selection the best configuration is reached:
The confusion matrix calculated on the test set is:
As we can clearly note the best configuation reached with supervised feature selection also leads to better performance in terms of all the scoreses. On more important thing is that with unsupervised features selection we obtain the best configuation just thanks to 11 features (against the 23 of the unsupervised method). This means a more interpretable and in term of computation efficience model.
(The code is reported at the end of the colab notebook for space reasons)
A random forest is an estimator that fits a number of decision tree classifiers(ensamble of tree) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The key point of this method is that large number of relatively uncorrelated models (decision tree) will leads to better performance than any of the insividual model. The uncorrelation between model is the main point of the random forest and it's relaized thanks to two different method:
In conclusion we can say that a random forest is a more stable method because the overall (over all the trees) prediction has a lower variance. That's beacuse if we take n indipendent observations each with know variance, the variance of the mean of the observations is lower than the single observation ones ( because is divided by n the number of osservations).
A Random Forest is computed from the training data and then tested on the test set. I tuned different hyperparamether with the stratified k-fold cross validation using k=5 . This tuning phase also include the tuning of the number of best features selceted with RFE.
In order to make comparison between the two different kind of feature selection I also fine tuned the model with the training-set obtained with unsupervised features selections. For the tuning phase the crucial metrics is Precision.
With supervised Feature selection the best configuration is reached with :
'mean radius', 'mean texture', 'mean perimeter', 'mean area',
'mean smoothness', 'mean concavity', 'mean concave points',
'area error', 'smoothness error', 'concavity error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst smoothness',
'worst compactness', 'worst concavity', 'worst concave points',
'worst symmetry', 'worst fractal dimension'
The confusion matrix calculated on the test set is :
With unsupervised features selection the best configuration is reached with:
The confusion matrix calculated on the test set is:
As expected the Random forest outperform the decsion tree in terms of all the scores. And as in the decsion Tree the best performance is reached with supervised feature selection. Note that with the Random Forest the best number of features is 21, bigger than the one selected with decsion tree. Thats reasonable because the Random forest has the chance to take into account different subsets of features and so it's possible that a more complete set of features is representative.
The goal of the linear SVM in its hard-margin formulation is to find a hyperplane in the feature space that correctly separates the classes with the largest possible margin (the margin is the distance between the hyperplane and the nearest points of each class). The equation of the plane is $w^Tx+b=0$, where $\vec{w},b$ are chosen so as to maximize its distance to the nearest points from either group. We can also note that the planes defined by the support vectors for a binary classification have respectively the equations $w^Tx+b=1$ and $w^Tx+b=-1$, and the margin is given by $\frac{2}{||w||}$. So we can define the primal optimization problem as: $$\min||\vec{w}||^2$$ subject to the constraint $y_i(w^Tx_i+b)\ge 1$.
This model assume that data are linearly separable otherwise no feasible solution is found.
A first way to extend SVM to cases where data are not linearlly separable is the soft-margin SVM.The point is to allow to some of the data to lay between the separating hyperplane and their marginal hyperplane, or even being put on the wrong side of the separating hyperplane. In order to do that we add a slack variable $ξ_i$ to the problem constraints: $$y_i(w^t+b)>1-ξ_i$$ with $$ξ_i>=0$$
Now our optimization problem has changed from "find a hyperplane with the widest margin possible such that every point is properly classified" to "find a hyperplane with a wide enough margin, taking into account that we also want to minimize the sum of the slacks" (which means that we want to put as many points as possible on the right side of their marginal hyperplane). In other words, our optimization problem is the following: $$\min_{w,b}\ \frac{1}{2}\|w\|^2 + C\sum_{i=1}^n \xi_i$$ given the above constraints. As we can see, the parameter C regulates the trade-off between obtaining a large margin and not misclassifying training points.
With all the previus hypotesis (standardization, oversampling) with a stratify k-fold cross validation with k=5 the model is fine tuned for the paramether C and two approch in study: unsupervised feature selection and supervides feature selection.
With supervised Fetaure selection the best configuration is reached with :
Best features :
'mean area', 'mean compactness', 'mean concavity',
'mean concave points', 'mean symmetry', 'mean fractal dimension',
'texture error', 'area error', 'smoothness error', 'compactness error',
'fractal dimension error', 'worst radius', 'worst texture',
'worst perimeter', 'worst area', 'worst concavity',
'worst concave points', 'worst fractal dimension'
The confusion matrix calculted on the test set is :
With unsupervised features selection the best score is reached with :
The confusion matrix of the predicrion of the test set is:
As we can see in this case the result are more similar and there are no significative difference between the two features selection tecnique. The global performance of the linear SVM classifier is good but non high as the ones for the Random Forest classifier.
The goal of an SVM algorithm as we saw is to find a hyperplane that could separate different classes of samples in their feature space with the largest possible margin. It is possible to enforce hard constraints on the correct classification of the samples or soften those constraints by adding a slack variable, but still SVM is only able to define linear boundaries. When the classification problem is non-linear, the ideal would be to find some non-linear decision boundaries. A simple solution is to map our data in a higher dimensional space where they are linearly separable and then apply an SVM estimator. So, given a mapping function φ we want to solve the following optimization problem $$min_{ w,b}+\frac{1}{2}‖w‖ 2 + C âˆ'_1^n ξ_ii$$ constrained by
$$y_i (φ(x i ) w^T + b) ≥ 1 âˆ' ξ i$$$$ξ_i ≥ 0$$Unfortunately finding such $φ$ is not an easy task and, even when we find it, the mapping could be computational heavy or even unfeasible. There is a method to avoid the explicit computation of the mapping, and that is the kernel trick. The optimal $w, b $ we are searching in our problem (the same applies for the linear SVM) can be found by mean of the Karush-Kuhn-Tucker theorem that is a generalization of the method of Lagrange multipliers (which allows only equality constraints). The theorem states that an optimal point (not necessarily the global optimum) for the problem is a saddle point of its Lagrange function representation and for that point the Karush-Kuhn-Tucker conditions hold. In our case, with slack variable and non-linearity,the Lagrange function would be $$L(w, b, ξ , α, η) = \frac{1}{2}‖w‖+ C âˆ' ξ i âˆ' âˆ' α_i [y_i (φ(x_i ) w^T+ b) âˆ' 1 + ξ_i ] âˆ' âˆ'η_i ξ_i$$
We have two ways to find this saddle point, we can maximize $L(w, b, ξ , α, η)$ with respect to $α, η $ and then minimize with respect to $ w, b, ξ $ and that is the primal optimization problem, or first minimize $L(w, b, ξ , α, η)$ with respect to $w, b, ξ$ and then maximize with respect to $α, η$ and this solves the dual optimization problem. In general this two approaches do not necessarily lead to the same solution (the dual problem provides a lower bound for the solution of the primal) but since our problem is convex, because is about minimizing a quadratic objective function with linear constraints, and it admits a solution that is subject to all the constraints with strict inequality (Slater’s condition) then the primal and the dual problem give the same solution (strong duality). Moreover, the KKT conditions are not only necessarily but also sufficient for the solution to be a global optimum. The fact that we can solve the problem in its dual or primal formulation indifferently is particularly useful because the dual problem depends only on the dot product of the samples.
$$\max_\alpha\ -\sum_{i,j} \alpha_i \alpha_j y_i y_j \varphi(x_i)\varphi(x_j)^T + \sum_i \alpha_i$$ subject to
$$\sum_i \alpha_i y_i = 0 \quad \text{with} \quad \alpha_i \in [0, C].$$ This formulation allows us to exploit the above-mentioned kernel trick. Instead of mapping the features into a higher-dimensional space and then computing the dot product, we design a kernel function $$K(x_i, x_j) = \varphi(x_i)\varphi(x_j)^T$$ that allows us to compute the dot products without leaving the original dimensional space of the features, even without knowing the mapping function $\varphi$, which could be, like in the case of the RBF kernel used in this homework, infinite-dimensional. The RBF kernel is defined as follows: $$K(x_i, x_j) = e^{-\frac{\|x_i - x_j\|^2}{2\sigma^2}}$$
An equivalent definition is $$K(x_i, x_j) = e^{-\gamma\|x_i - x_j\|^2}$$
That use the parameter $\gamma=\frac{1}{2\sigma ^2}$ As we see the RBF kernel is in the form of a Gaussian function and the $γ $ parameter is is inversely proportional to its variance. The effect of γ can be intuitively comprehended by observing that K could be seen as a similarity measure, because its value is between 0 (in the limit) and 1 (when x_i = x_j ) and decreases with the distance between x_i and x_j . Then, when $γ$ is small $ K(x_i , x_j ) $ tend to be 1 for every couple of samples $ (i ,j)$, this means that all of our data are basically the same, so the classifier struggles to discriminate different classes of points. At the same time when $γ$ is too big $K(x_i , x_j )$ tend to be 0 which means that every sample is unique, and this makes our estimator unable to generalize. Another interpretation of $γ$ which is related to the one already presented is that it can be seen as the inverse of the radius of influence of a support vector. It means that with low values of $γ$ each support vector will have influence on the decision of the class in a wider space around it, with the result of less curve boundaries. On the other hand, high values of $γ$ means that each support vector influence accounts only in a region very close to it, and that determinates much more curve boundaries that try to mimic the shape of the classes in the feature space.
So as with all the other classifier I train the model with just unsupervised features selection strategy. When using kernel is not easy evaluate the goodness of features during the classification because mapped in a new space and also the module of sklearn RFECV doesn't support this configuration. With k-fold cross validation strategy I tuned both C and gamma.
The best configuration is reached with :
The confusion matrix is :
In cocnlusion the result are similar to the one achived with the linear kernel, but the best case for this classification task remain Random Forest.
The logistic regression in a generalization of linear regression that allow to solve binary classification problems. In order to execute this generalization, the steps are:
I trained the model after supervised features selection trying both the l1, and l2 loss function and tuning the paramether C. The best configuration is reached with:
With supervised feature selection the best configuration is reached with :
'mean radius', 'mean texture', 'mean area', 'mean concavity',
'mean concave points', 'mean fractal dimension', 'area error',
'compactness error', 'worst radius', 'worst texture', 'worst perimeter',
'worst area', 'worst smoothness', 'worst concave points',
'worst symmetry'
The confusion matrix on the test set is:
With the unsupervised features selction training set the best configuration is :
The confusion matrix get testing is:
In this case the scores are quite similar but of course using 15 features instead of 23 is computational more efficient and it's always a good choiche if there is no loss in term of performance.
K-Nearest Neighbors is classification alghorithm
Neighbors-based classification is a type of instance-based learning or non-generalizing learning: it does not attempt to construct a general internal model, but simply stores instances of the training data. Classification is computed from a simple majority vote of the nearest neighbors of each point.
The closness can be avaluated with different method. The principal are:
Some consideration on KNN:
The only paramether that has to be tuned is the K.
In order to apply a supervised feature selection for the KNN algorithm is necessary to use an external estimator that assigns weights to features. For this reason I choose the feature selected with the decision tree classifier in his best configuration on this task.
So I train the model on the following features :
'mean concavity', 'mean concave points', 'mean symmetry', 'concave points error', 'symmetry error', 'fractal dimension error', 'worst radius', 'worst texture', 'worst perimeter', 'worst area', 'worst smoothness'
The best number of neighbord is 20.
In this configuration the confusion matrix obtained testion on the test set is:
With the unsupervised features selection settings the best configuration is reached with K=3,and testion it on the test set the following confusion matrix is obteined:
As we can see, the improvement this time is remarkable. This behaviour is justified by the fact that the KNN classifier is strongly sensitive to dimensionality.
As we can see, for this specific binary classification task the best score on the test set is reached with the Random Forest classifier, which was improved by using supervised feature selection. In all the other cases the supervised feature selection technique doesn't show any particular improvement, as expected. It is important to remark that feature selection is useful in order to avoid the so-called curse of dimensionality, and so reducing the number of attributes without registering a loss in performance is still a good result.
However all the method achive good performance but this is also due to the dataset wich has been constructed for the same porpuse of the analysis.
Finally we can observe that even if the dataset is composed by 30 feature, also 10/15 of them are usefull for reach good performance. This dimensionality reduction leads of course to computationally improvement but also in term of interpretability of the model.
# Features selected by RFE with the decision tree in its best configuration
# (reused for the KNN experiments).
head_tree = [
    'mean concavity', 'mean concave points', 'mean symmetry',
    'concave points error', 'symmetry error', 'fractal dimension error',
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness',
]
X_SMOTE[head_tree]
def build_classifier(X_train, y_train, X_test, y_test, clf_to_evaluate, scores, param_grid, n_folds=3):
    """Tune `clf_to_evaluate` with a grid search and report its test scores.

    Parameters
    ----------
    X_train, y_train : development data the grid search is fitted on.
    X_test, y_test   : held-out data used for the final classification report.
    clf_to_evaluate  : estimator handed to GridSearchCV.
    scores           : scoring string (or scorer) used to rank configurations.
    param_grid       : hyper-parameter grid to explore.
    n_folds          : number of cross-validation folds (default 3).

    Returns
    -------
    The fitted GridSearchCV object (exposes best_params_, cv_results_, ...).
    """
    # BUG FIX: the original body referenced an undefined name `score`;
    # the parameter is called `scores`, so use it consistently.
    print("# Tuning hyper-parameters for %s" % scores)
    print()
    # NOTE: `iid=False` was dropped — the parameter was deprecated in
    # scikit-learn 0.22 and removed in 0.24 (False was already the default,
    # so results are unchanged on any version that still accepted it).
    clf = GridSearchCV(clf_to_evaluate, param_grid, cv=n_folds,
                       scoring=scores, verbose=True, n_jobs=4)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        # +/- two standard deviations around the mean CV score.
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    return clf
#Train Tree with unsupervised features selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# Grid search a RandomForest on the unsupervised-selected features
# (X_SMOTE_unsup / unsup_test), tracking the best test-set precision.
max_deph = [5, 10, 15, 20]
num_estimators = [10, 20, 30, 40]
best = 0   # best precision seen so far
bd = 0     # max_depth of the best model
mf = 0     # n_estimators of the best model
for d in max_deph:
    for f in num_estimators:
        clf = RandomForestClassifier(random_state=42, max_depth=d, n_estimators=f)
        clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
        # get_params is a method: call it (the original printed the bound method).
        print(clf.get_params())
        # Predict once and reuse — the original recomputed predictions
        # for every single metric.
        y_pred = clf.predict(unsup_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
        if pre >= best:
            # BUG FIX: the original also set num_best = rfecv.n_features_
            # here, but no `rfecv` exists in this cell (copy-paste leftover
            # from the RFECV cell) — it would raise NameError or read a
            # stale selector from another cell.
            best = pre
            bd = d
            mf = f
print('CONCLUSION:')
print('best precision is:', best)
print('max_deph:', bd)
print('number_estimators', mf)
#TRAIN WITH RFE On RF
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
# RFECV wrapped around a RandomForest: for every (max_depth, n_estimators)
# pair, select the feature subset maximising 5-fold CV precision, then
# score the fitted selector on the held-out test set.
max_deph = [5, 10, 15, 20]
num_estimators = [10, 20, 30, 40]
best = 0           # best test precision so far
bd = 0             # its max_depth
mf = 0             # its n_estimators
num_best = 0       # its selected-feature count
best_features = [] # its selected-feature names
for d in max_deph:
    for f in num_estimators:
        clf = RandomForestClassifier(random_state=42, max_depth=d, n_estimators=f)
        rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='precision')  # 5-fold cross-validation
        rfecv = rfecv.fit(X_SMOTE, y_SMOTE)
        # Call get_params() — the original printed the bound method object.
        print(clf.get_params())
        print('Optimal number of features :', rfecv.n_features_)
        print('Best features :', smote_df.columns[rfecv.support_])
        # Predict once and reuse for all metrics (the original predicted 5x).
        y_pred = rfecv.predict(x_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        if pre >= best:
            best = pre
            bd = d
            mf = f
            num_best = rfecv.n_features_
            best_features = smote_df.columns[rfecv.support_]
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
print('CONCLUSION:')
print('best precision is:', best)
print('max_deph:', bd)
print('num_of_estimato:', mf)
print('num_best_features', num_best)
print('best_features=', best_features)
#Unsupervised feature selection with KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# KNN on the unsupervised-selected features: try a few neighbour counts
# and report every metric plus the confusion matrix.
C = [3, 10, 20]
for d in C:
    clf = KNeighborsClassifier(n_neighbors=d)
    clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
    # Call get_params() — the original printed the bound method object.
    print(clf.get_params())
    # Predict once and reuse — the original recomputed predictions
    # for every metric.
    y_pred = clf.predict(unsup_test)
    recall = recall_score(y_test, y_pred)
    print('Recall is: ', recall)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is: ', accuracy)
    f1 = f1_score(y_test, y_pred)
    print('F1 score is: ', f1)
    cm = confusion_matrix(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    print('Precision is:', pre)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d")
    plt.show()
#Train supervised feature selection KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# KNN restricted to the decision-tree-selected features (supervised
# feature selection): train on the SMOTE data, evaluate on the test set.
head_tree = ['mean concavity', 'mean concave points', 'mean symmetry',
             'concave points error', 'symmetry error', 'fractal dimension error',
             'worst radius', 'worst texture', 'worst perimeter', 'worst area',
             'worst smoothness']
C = [3, 10, 20]
for k in C:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_SMOTE[head_tree], y_SMOTE)
    # Predict once and reuse — the original re-ran predict six times
    # per neighbour setting.
    y_pred = clf.predict(x_test[head_tree])
    recall = recall_score(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    print('Precision is:', pre)
    print('Recall is: ', recall)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is: ', accuracy)
    f1 = f1_score(y_test, y_pred)
    print('F1 score is: ', f1)
    print(k)
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d")
    plt.show()
#Train Tree with unsupervised features selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# Grid search a DecisionTree on the unsupervised-selected features,
# tracking the configuration with the best test-set precision.
max_deph = [5, 10, 15]
max_features = ['sqrt', 'log2']
best = 0   # best precision so far
bd = 0     # its max_depth
mf = 0     # its max_features
for d in max_deph:
    for f in max_features:
        clf = DecisionTreeClassifier(random_state=42, max_depth=d, max_features=f)
        clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
        # Call get_params() — the original printed the bound method object.
        print(clf.get_params())
        # Predict once and reuse for all metrics.
        y_pred = clf.predict(unsup_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
        if pre >= best:
            # BUG FIX: the original also set num_best = rfecv.n_features_
            # here, but no RFECV selector exists in this cell
            # (copy-paste leftover from the RFE cells).
            best = pre
            bd = d
            mf = f
print('CONCLUSION:')
print('best precision is:', best)
print('max_deph:', bd)
print('max_features:', mf)
#Train unsupervised features selction wit Logistic
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# Logistic regression on the unsupervised-selected features, sweeping
# regularisation strength and penalty type.
C = [0.1, 1, 10]
penality = ['l1', 'l2']
for d in C:
    for g in penality:
        clf = LogisticRegression(solver='saga', random_state=42, penalty=g, C=d)
        # BUG FIX: the original called
        #   clf.fit(smote_df.columns[rfecv.support_], y_SMOTE)
        # i.e. it tried to fit the model on an Index of column *names*.
        # Since the model is evaluated on unsup_test, it must be trained on
        # the matching unsupervised-selection training data.
        clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
        # Call get_params() — the original printed the bound method object.
        print(clf.get_params())
        # Predict once and reuse for all metrics.
        y_pred = clf.predict(unsup_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
#TRAIN WITH RFE Logistic
from sklearn.feature_selection import RFE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
# Plain RFE (no cross-validation — the original comment claiming
# "5-fold cross-validation" was wrong) wrapped around logistic regression.
# With no n_features_to_select given, RFE keeps half of the features;
# the selected column names are printed for each configuration.
penality = ['l1', 'l2']
C = [0.1, 1, 10]
num_best = 0        # kept for parity with the other cells (unused here)
best_features = []  # kept for parity with the other cells (unused here)
for reg_c in C:
    for pen in penality:
        clf = LogisticRegression(random_state=42, penalty=pen, C=reg_c, solver='saga')
        print(reg_c, pen)
        rfecv = RFE(estimator=clf)
        rfecv = rfecv.fit(X_SMOTE, y_SMOTE)
        print(smote_df.columns[rfecv.support_])
from matplotlib.colors import ListedColormap
from mlxtend.plotting import plot_decision_regions
def plot_decision_boundaries(X_train, Y_train, model, stringa):
    """Draw a fitted model's 2-D decision regions plus the training points.

    X_train : array whose first two columns are the plotted features.
    Y_train : labels used to colour the scatter points.
    model   : fitted classifier exposing .predict.
    stringa : title for the figure.
    """
    step = 0.07  # mesh resolution
    region_cmap = ListedColormap(['gold', 'mediumturquoise'])
    point_cmap = ListedColormap(['orange', 'teal', ])
    # Bounding box of the data, padded by 1 on every side.
    x_lo, x_hi = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
    y_lo, y_hi = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step),
                         np.arange(y_lo, y_hi, step))
    # Classify every grid node, then reshape back to the mesh for plotting.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)
    plt.figure(figsize=(7, 7))
    plt.pcolormesh(xx, yy, Z, cmap=region_cmap)
    # Overlay the training points on top of the decision regions.
    plt.scatter(X_train[:, 0], X_train[:, 1], c=Y_train, cmap=point_cmap,
                edgecolor='k', s=70)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title(stringa)
    plt.show()
# Visualise LinearSVC decision boundaries on two raw features.
X_SMOTE.head()
hd = ['mean radius', 'mean smoothness']
# BUG FIX: the original title said "Dacision ... mean area" although `hd`
# selects 'mean radius'; the label now matches the data actually plotted.
stringa = 'Decision boundaries for mean radius and mean smoothness, with C=0.1 ,LinearSVC'
two_feat_train = X_train[hd]
# BUG FIX: the original passed two_feat_train.reset_index().values, which
# prepends the DataFrame index as column 0 — so the plot's x-axis was the
# row index, not 'mean radius'. Use the feature values directly.
# (Also removed two no-op lines that referenced .to_numpy without calling it.)
b = two_feat_train.to_numpy()
c = y_train.values
clf = LinearSVC(C=0.1, random_state=42)
clf.fit(two_feat_train, y_train)
plot_decision_boundaries(b, c, clf, stringa)
#TRAIN WITH RFE On SMVLinear
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
# RFECV wrapped around LinearSVC: for each C, select the feature subset
# maximising 5-fold CV f1, then score the fitted selector on the test set.
C = [0.1, 1, 10, 100, 1000]  # BUG FIX: 1000 was listed twice in the grid
best = 0           # best test precision so far
bd = 0             # its C value
num_best = 0       # its selected-feature count
best_features = [] # its selected-feature names
for d in C:
    clf = LinearSVC(random_state=42, C=d)
    rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='f1')  # 5-fold cross-validation
    rfecv = rfecv.fit(X_SMOTE, y_SMOTE)
    # Call get_params() — the original printed the bound method object.
    print(clf.get_params())
    print('Optimal number of features :', rfecv.n_features_)
    print('Best features :', smote_df.columns[rfecv.support_])
    # Predict once and reuse for all metrics (the original predicted 6x).
    y_pred = rfecv.predict(x_test)
    recall = recall_score(y_test, y_pred)
    print('Recall is: ', recall)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is: ', accuracy)
    f1 = f1_score(y_test, y_pred)
    print('F1 score is: ', f1)
    cm = confusion_matrix(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    print('Precision is:', pre)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d")
    plt.show()
    if pre >= best:
        best = pre
        bd = d
        num_best = rfecv.n_features_
        best_features = smote_df.columns[rfecv.support_]
print('CONCLUSION:')
print('best precision is:', best)
print('Best C:', bd)
print('num_best_features', num_best)
print('best_features=', best_features)
# List the scoring strings that GridSearchCV / RFECV accept.
# NOTE(review): sklearn.metrics.SCORERS was deprecated in 1.0 and removed in
# scikit-learn 1.3 — on newer versions use sklearn.metrics.get_scorer_names().
sklearn.metrics.SCORERS.keys()
#Train LINEAR SVM with unsupervised features selection
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# LinearSVC on the unsupervised-selected features, sweeping C and
# tracking the best test-set precision.
C = [0.1, 1, 10, 100, 1000]  # BUG FIX: 1000 was listed twice in the grid
best = 0   # best precision so far
bd = 0     # its C value
for d in C:
    clf = LinearSVC(random_state=42, C=d)
    clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
    # Call get_params() — the original printed the bound method object.
    print(clf.get_params())
    # Predict once and reuse for all metrics.
    y_pred = clf.predict(unsup_test)
    recall = recall_score(y_test, y_pred)
    print('Recall is: ', recall)
    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy is: ', accuracy)
    f1 = f1_score(y_test, y_pred)
    print('F1 score is: ', f1)
    cm = confusion_matrix(y_test, y_pred)
    pre = precision_score(y_test, y_pred)
    print('Precision is:', pre)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d")
    plt.show()
    if pre >= best:
        best = pre
        bd = d
print('CONCLUSION:')
print('best precision is:', best)
print('Best C:', bd)
#TRAIN WITH RFE On decsion TREE
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
# RFECV wrapped around a DecisionTree: for each (max_depth, max_features)
# pair, select the feature subset maximising 5-fold CV precision, then
# score the fitted selector on the held-out test set.
max_deph = [5, 10, 15]
max_features = ['sqrt', 'log2']
best = 0           # best test precision so far
bd = 0             # its max_depth
mf = 0             # its max_features
num_best = 0       # its selected-feature count
best_features = [] # its selected-feature names
for d in max_deph:
    for f in max_features:
        clf = DecisionTreeClassifier(random_state=42, max_depth=d, max_features=f)
        rfecv = RFECV(estimator=clf, step=1, cv=5, scoring='precision')  # 5-fold cross-validation
        rfecv = rfecv.fit(X_SMOTE, y_SMOTE)
        # Call get_params() — the original printed the bound method object.
        print(clf.get_params())
        print('Optimal number of features :', rfecv.n_features_)
        print('Best features :', smote_df.columns[rfecv.support_])
        # Predict once and reuse for all metrics (the original predicted 6x).
        y_pred = rfecv.predict(x_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        if pre >= best:
            best = pre
            bd = d
            mf = f
            num_best = rfecv.n_features_
            best_features = smote_df.columns[rfecv.support_]
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
print('CONCLUSION:')
print('best precision is:', best)
print('max_deph:', bd)
print('max_features:', mf)
print('num_best_features', num_best)
print('best_features=', best_features)
#Train LINEAR SVM with unsupervised features selection
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
# RBF-kernel SVC on the unsupervised-selected features (the cell header
# said "LINEAR SVM", but kernel='rbf' is what is actually trained).
# Sweeps C and gamma, tracking the best test-set accuracy.
C = [0.1, 1, 10, 100, 1000]  # BUG FIX: 1000 was listed twice in the grid
gamma = [0.001, 0.01, 0.1, 1, 10]
p = 0            # best accuracy so far
bd = 0           # its C value
best_gamma = 0   # its gamma value
for d in C:
    for g in gamma:
        clf = SVC(random_state=42, C=d, gamma=g, kernel='rbf')
        clf.fit(X_SMOTE_unsup, y_SMOTE_unsup)
        # Call get_params() — the original printed the bound method object.
        print(clf.get_params())
        # Predict once and reuse for all metrics.
        y_pred = clf.predict(unsup_test)
        recall = recall_score(y_test, y_pred)
        print('Recall is: ', recall)
        accuracy = accuracy_score(y_test, y_pred)
        print('Accuracy is: ', accuracy)
        f1 = f1_score(y_test, y_pred)
        print('F1 score is: ', f1)
        cm = confusion_matrix(y_test, y_pred)
        pre = precision_score(y_test, y_pred)
        print('Precision is:', pre)
        plt.figure()
        sns.heatmap(cm, annot=True, fmt="d")
        plt.show()
        if accuracy >= p:
            p = accuracy
            bd = d
            best_gamma = g
print('CONCLUSION:')
# BUG FIX: the original printed `best` (a stale value left over from an
# earlier cell) and `g` (simply the last gamma tried) instead of the
# tracked optima, and labelled the tracked *accuracy* as "precision".
print('best accuracy is:', p)
print('Best C:', bd)
print('Best gamma', best_gamma)